
from sklearn.feature_extraction.text import CountVectorizer
from nd_utils.nltk_util import Nltk_util

class Title_ngram:
    """Extract the set of word n-grams of a fixed size from a title.

    The title is first passed through ``Nltk_util.stem_process`` (a
    project helper; presumably it stems/normalizes the words -- confirm
    against ``nd_utils``) and then tokenized into word n-grams by a
    scikit-learn ``CountVectorizer``.
    """

    def __init__(self, gram_num):
        """Build the vectorizer and the pre-processing helper.

        Args:
            gram_num: Size of the n-grams; used as both the lower and the
                upper bound of ``ngram_range``, so only n-grams of exactly
                this size are produced.
        """
        self.ngram = gram_num
        self.ngram_vectorizer = CountVectorizer(
            analyzer='word', ngram_range=(gram_num, gram_num))
        self.word_stem = Nltk_util()

    def do_title_ngram(self, title):
        """Return the distinct n-grams found in ``title``.

        The shared vectorizer is refit on every call, so afterwards its
        vocabulary reflects only the most recently processed title.

        Args:
            title: Raw title text; handed to ``stem_process`` before being
                vectorized.

        Returns:
            A set of n-gram strings, or an empty set when the vectorizer
            cannot build a vocabulary from the title.
        """
        _title = self.word_stem.stem_process(title)
        try:
            # CountVectorizer raises ValueError ("empty vocabulary") when the
            # text yields no n-gram of the requested size, e.g. a title with
            # too few words. Treat that as "no n-grams" rather than an error.
            self.ngram_vectorizer.fit_transform([_title])
        except ValueError:
            return set()

        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement and fall back only on releases that predate it.
        get_names = getattr(self.ngram_vectorizer, 'get_feature_names_out', None)
        if get_names is None:
            get_names = self.ngram_vectorizer.get_feature_names
        return set(get_names())
